In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
In [10]:
# Showing matplotlib plots in jupyter notebook
%matplotlib inline
In [11]:
# Using patientId for better intitution
# Drawing polar heatmap
# Learning curve with smaller numbers
# How we can use SVR for Regression
# Making module for finding best functino of sklearn
In [12]:
# Getting Training dataset
# Import dataset as Dataframe
df_full = pd.read_csv("../Dataset/slice_localization_data.csv", sep=',')
# Import dataset as numpy array
X_full = np.genfromtxt("../Dataset/slice_localization_data.csv", delimiter=',', skip_header=1)
# Making distnict output column for later uses
y = X_full[:,385]
# Removing first and last column from the dataset
X = X_full[:,1:385]
# Converting X to Dataframe
df = pd.DataFrame(X)
df.columns = df_full.columns[1:385]
# Getting size of the Training Dataset
m,n = X.shape
In [13]:
# Getting shape of the Dataset
X_full.shape
Out[13]:
In [14]:
# Getting first 10 rows of the numpy array
X_full[0:10]
Out[14]:
In [15]:
# Getting first 10 rows of the dataframe
df_full.head(10)
Out[15]:
In [16]:
# Getting some intuition of data
df_full.describe()
Out[16]:
In [17]:
# Gettign unqiue values of the "Patient ID"
np.unique(df_full["patientId"])
Out[17]:
In [18]:
# Import mean_squared_error function from Sklearn Library
from sklearn.metrics import mean_squared_error
def plotLearningCurves(X,y,step):
m,n = X.shape
maxVal = (int)(m / 10) * 10
N_size_arr = np.arange(10, maxVal + 10, step)
error_arr = np.zeros(( len(np.arange(10, maxVal + 10, step)) ,2 ))
index = 0
# Increasing train dataset size, "step" times in each iteration
for i in N_size_arr:
# Splitting Training dataset with size i into train and cross validation sets
X_train, X_test, y_train, y_test = train_test_split(X[:i,:], y[:i], test_size=0.33, random_state=42)
# Fitting Model
lm.fit(X_train, y_train)
# Computing both mean squared error of training dataset and cross validation datasets predections
error_arr[index,0] = mean_squared_error(y_train , lm.predict(X_train))
error_arr[index,1] = mean_squared_error(y_test, lm.predict(X_test))
# Increasing index with 1
index += 1
# Initializing the figure
fig = plt.figure(figsize=(12,8))
ax = fig.add_axes([0,0,1,1])
ax.set_yscale('log')
# Plotting "Training set size" vs. "Mean Squared Error" for both of the train and cross validation dataset's errors
line1, = ax.plot(N_size_arr,error_arr[:,0], c='red')
line2, = ax.plot(N_size_arr,error_arr[:,1], c='blue')
# Adding labels && legends to our plot
ax.set_xlabel("N (Training set size)")
ax.set_ylabel("Mean Squared Error")
ax.legend((line1,line2),("Train Error","Test Error"))
In [19]:
# Plot "Means of each column in dataset( Attributes ) except first and last column" distplot
sns.distplot(df.mean())
# ===> We can conclude all of our attributes are in [-1,1] range, so we don't need to use feature normalize technique
Out[19]:
In [20]:
fig = plt.figure(figsize=(12,10))
axes1 = fig.add_axes([0, 2, 1, 1], projection='polar')
axes2 = fig.add_axes([1, 2,1,1], projection='polar')
axes3 = fig.add_axes([0, 1, 1, 1], projection='polar')
axes4 = fig.add_axes([1, 1,1,1], projection='polar')
# Plotting first example of bone structure of the one person
axes1.plot(X[150,1:241], 'bo', ms=10)
axes1.set_xlabel("Bone Structure")
# Plotting first example of air inclusion of the one person
axes2.plot(X[150,241:386], 'ro', ms=10)
axes2.set_xlabel("Air Inclusion")
# Plotting second example of bone structure of the one person
axes3.plot(X[3541,1:241], 'bo', ms=10)
axes3.set_xlabel("Bone Structure")
# Plotting second example of air inclusion of the one person
axes4.plot(X[3541,241:386], 'ro', ms=10)
axes4.set_xlabel("Air Inclusion")
Out[20]:
In [21]:
# Plot "Reference" column distplot
plt.figure(figsize=(12,8))
sns.distplot(y, bins=100)
# ==> We can see that we don't have any image for locations of body with value bigger than 100, and majority of the
# images are taken with values in [25,40] range
Out[21]:
In [22]:
X.shape
Out[22]:
In [23]:
y.shape
Out[23]:
In [24]:
# Using SkLearn Library "Linear Regression" Method
from sklearn.linear_model import LinearRegression
# Initialization Model
lm = LinearRegression()
In [25]:
# Splitting Data into Training and Cross Validation Datasets
# Using Sklearn library for splitting data
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
In [26]:
# Fitting Model with Train dataset ( Columns except "PatientId", "Reference")
lm.fit(X_train,y_train)
Out[26]:
In [27]:
# Model Thetas
lm.coef_
Out[27]:
In [28]:
# Predecting Reference values with Cross validation dataset
pred = lm.predict(X_test)
# Evaluating error with squared error method without using SkLearn Library
eval_err = np.sum( ( pred - y_test ) ** 2 , axis=0 ) / len(pred)
eval_err
Out[28]:
In [29]:
# Plot predections vs. y_test for better understanding how our model works!
fig = plt.figure(figsize=(12,8))
ax = fig.add_axes([0,0,1,1])
ax.set_xlabel("Predections")
ax.set_ylabel("Test Target Varible")
ax.plot(y_test, pred,'bo',ms=1)
Out[29]:
In [30]:
plotLearningCurves(X,y,1000)
In [31]:
# Checking error on Train set value and Comparing it with Test dataset error
pred_train = lm.predict(X_train)
mean_squared_error(pred_train,y_train)
# ===> We can see both of the MSE on Training Dataset && Cross validation Dataset is high, so maybe algorithm suffers from High bias
Out[31]:
In [32]:
# Normalize features but maybe it doesn't help, but it can help for decreasing time of the fitting algorithm
lm.normalize = True
lm.fit(X_train,y_train)
Out[32]:
In [33]:
pred_train = lm.predict(X_train)
mean_squared_error(pred_train, y_train)
# ===> We can see normalizing doesn't help
Out[33]:
In [34]:
# Using other regression model of sklearn
from sklearn.linear_model import SGDRegressor
sg = SGDRegressor(alpha=0,max_iter=10000,n_iter=10000,learning_rate='constant',eta0=0.0001)
In [35]:
# Fitting Model
sg.fit(X_train,y_train)
Out[35]:
In [36]:
# Predecting Using new model ( SGDRegressor )
pred = sg.predict(X_test)
In [37]:
# Evaluating model
mean_squared_error(y_test,pred)
Out[37]:
In [38]:
# Plot predections vs. y_test for better understanding how our model works!
fig = plt.figure(figsize=(12,8))
ax = fig.add_axes([0,0,1,1])
ax.set_xlabel("Predections")
ax.set_ylabel("Test Target Varible")
ax.plot(y_test, pred,'bo',ms=1)
Out[38]:
In [39]:
# Because train Error && test Error are very close to each other and they are almost high so we can conclude we are suffering \
# from high bias, and we should add extra features
# In the last try, We saw that more iterations doesn't help so much \
# So we try more features
# Function for finding best degree and alplha
# Before running this function we should import following functions
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn import svm
# Version 1.0.0
def findBestRegressorModel(X,y,sample_d= np.arange(1,11,1) ,sample_alpha= None, _kernel= 'rbf'):
# sample_d should be tuple with size of 2
if( type(sample_d) == tuple ):
if( len(sample_d) != 2):
raise ValueError("sample_d length should be 2 !!!")
else:
raise ValueError("sample_d should be tuple !!!")
# Finding which trainer model is better to use
trainer = findBestTrainer(X)
# Defining default value of "lambda of regularization part in loss function ( alpha )"
if( sample_alpha == None ):
sample_alpha = np.array([0.0001,0.001,0.003,0.01,0.03,0.1,0.3,1,3,10])
# Container of the model error in each degree ( We'll use last two column for holding lambda and degree values \
# in each iteration )
error_tracker = np.zeros(( len(sample_d), 4))
# Spliting data into Train, Cross Validation, Test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Creating artificial new features with degrees defined in sample_d array
# Container of our artificial created new features
A_train = extraFeatureCreature(X_train.copy(), sample_d)
for deg in sample_d:
# Spliting data into Train, Cross Validation dataset
X_train, X_cross, y_train, y_cross = train_test_split(A_train[deg - 1], y, test_size=0.2)
if( trainer == "SGDRegressor" ):
# It's better to use SGDRegressor
for lam in sample_alpha:
sg = SGDRegressor(alpha=lam, max_iter=1000, n_iter=1000)
sg.fit(X_train,y_train)
pred_train = sg.predict(X_train)
pred_cross = sg.predict(X_cross)
MSE_train = mean.squared_error(pred_train, y_train)
MSE_cross = mean_squared_error(pred_cross, y_cross)
error_tracker[deg - 1][0] = MSE_train
error_tracker[deg - 1][1] = MSE_cross
error_tracker[deg - 1][2] = lam
error_tracker[deg - 1][3] = deg
elif ( trainer == "SVR_Ridge" ):
# It's better to use "Linear SVR Regressor" or "rbf SVR Regressor"
# We use "rbf" kernel because it usually give better predections, but obviously since it's more complicated model \
# it takes much more time
# Initializing SVR Model
clf = svm.SVR(kernel= _kernel, max_iter=1000)
# Fitting model with training set
clf.fit(X_train,y_train)
# Predecting outputs
pred_train = clf.predict(X_train)
pred_cross = clf.predict(X_cross)
# Evaluating model
MSE_train = mean.squared_error(pred_train, y_train)
MSE_cross = mean_squared_error(pred_cross, y_cross)
# Tracking error in each step
# Note: Because we don't use any lambda in SVR model, so we don't need to add any \
# values to third column of the error_tracker array
error_tracker[deg - 1][0] = MSE_train
error_tracker[deg - 1][1] = MSE_cross
error_tracker[deg - 1][3] = deg
else:
# In later Versions of the module, we will use one of the \
# SkLearn "Lassor" or "ElasticNet" Regressors if prior conditions are passed
# but for now, we give up this with informing user to use one of the "Lasso" or "ElasticNet" Regressors
print("It's better to use 'Lasso' or 'ElasticNet' Regressors!")
# Creating extra features for test dataset
A_test = extraFeatureCreature(X_test.copy(), sample_d)
# Plot "Degree of model" vs. "Train and Test Error" for finding best possible model visually
twoPlotsInOne(X1= error_tracker[:,3], \
y1= error_tracker[:,0], y2= error_tracker[:,1], \
X_label= "D (Degree of the model)", y_label= "Train && Test Error", \
first_legend= "Train Error", second_legend= "Test Error")
# Checking which model has been used
if( trainer == "SGDRegressor" ):
# SkLearn SGDRegressor Model has been used
# Finding best lambda and Degree
best_lam = error_tracker[np.argmin(error_tracker[:,1])][2]
best_deg = error_tracker[np.argmin(error_tracker[:,1])][3]
sg = SGDRegressor(alpha=best_lam, max_iter=1000, n_iter=1000)
sg.fit(A_train[best_deg - 1],y_train)
pred = sg.predict(A_test[best_deg - 1])
elif( trainer == "SVR_Ridge" ):
# "SVR" or "Ridge" Regressors has been used !!!
# Finding best Degree
best_deg = error_tracker[np.argmin(error_tracker[:,1])][3]
best_lam = None
# Initializing SVR Model
clf = svm.SVR(kernel= _kernel, max_iter=1000)
clf.fit(A_train[best_deg - 1],y_train)
pred = clf.predict(A_test[best_deg - 1])
else:
# "Lasso" or "ElasticNet" has been used !!!
# In Later versions of this module, we will implement this section also \
return "It's better to use 'Lasso' or 'ElasticNet' Regressors!"
# Checking how much good do our model generalize !!
test_error = mean_squared_error(y_test,pred)
return { "Degree" : best_deg, "Lambda": best_lam, "Test Error": test_error }
def findBestTrainer(X):
# This threshhold is inspired by Sklearn flow chart
# Link for more information : http://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
if( len(X) > 100000 ):
return "SGDRegressor"
else:
# Checking how many features is important !!!
# This proof is inspired by Sklearn flow chart also.
# Link for more information : http://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
# In later versions of this function we should make better logic for this.
if(X.shape[1] < 100):
# It's better to use SkLearn "Lasso" or "ElasticNet" Regressors
return "Lasso_ElasticNet"
else:
# It's better to use SkLearn "SVR" or "RidgeRegressors" Regressors
return "SVR_Ridge"
def extraFeatureCreature(X, sample_d = np.arange(1,11,1)):
# Container of our artificial created new features
A = []
# Creating artificial new features with degrees defined in sample_d array
for deg in sample_d:
# No creating new feautre when degree is as same as our real Data
if(deg != 1):
X = np.concatenate((X, np.power(X, deg)))
# Appending new model with new degree
A.append(X)
def twoPlotsInOne(X1,y1,y2,X2= None, \
c1= 'red',c2= 'blue', \
X_label= None, y_label= None, \
first_legend= None, second_legend= None):
# Defining size of the plot
fig = plt.figure(figsize=(12,8))
ax = fig.add_axes([0,0,1,1])
# Plotting data's
line1, = ax.plot( X1 ,y1,c=c1)
line2, = ax.plot( X1 if X2 == None else X2 ,y2,c=c2)
# Adding 'x' axis label
ax.set_xlabel(X_label)
# Adding 'y' axis label
ax.set_ylabel(y_label)
# Adding Legends to our plot
ax.legend((line1,line2),(first_legend,second_legend))
In [ ]: